{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 3666,
     "status": "ok",
     "timestamp": 1616705610584,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 240
    },
    "id": "JFf-AvwE0llf",
    "outputId": "fd63adab-6106-4b6c-cf60-fae259d15aaa"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "importing Jupyter notebook from SelectIndices.ipynb\n",
      "E:\\Dropbox\\Optimal Training Sets\\Replication File v4\n"
     ]
    }
   ],
   "source": [
    "#Importing the required libraries\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import random\n",
    "\n",
    "import import_ipynb\n",
    "import SelectIndices as si\n",
    "\n",
    "# Set the random seed\n",
    "random.seed(10012)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "executionInfo": {
     "elapsed": 10609,
     "status": "ok",
     "timestamp": 1616705626067,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 240
    },
    "id": "ZBiPUkQLVFCg"
   },
   "outputs": [],
   "source": [
    "dataset_names = ['eo', 'stwts']\n",
    "embed_types = ['cvec_pca16', 'cvec_nmf16', 'cvec_umap16', 'cvec_tsne16', 'lda100', 'bert', 'roberta', 'distil', 'glove6B', 'universal']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 2899685,
     "status": "ok",
     "timestamp": 1616708517734,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 240
    },
    "id": "a5LMcrvMHz4K",
    "outputId": "b54d2e19-b6d5-473c-ce43-4c7c5f20ea1b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eo_cvec_pca16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_cvec_nmf16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_cvec_umap16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_cvec_tsne16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_lda100_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_bert_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_roberta_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_distil_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_glove6B_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "eo_universal_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_cvec_pca16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_cvec_nmf16_full.csv\n",
      "Creating kld matrix...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:17: RuntimeWarning: divide by zero encountered in log\n",
      "<string>:17: RuntimeWarning: divide by zero encountered in double_scalars\n",
      "<string>:17: RuntimeWarning: invalid value encountered in double_scalars\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving kld matrix...\n",
      "stwts_cvec_umap16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_cvec_tsne16_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_lda100_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_bert_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_roberta_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_distil_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "stwts_glove6B_full.csv\n",
      "Creating kld matrix...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<string>:17: RuntimeWarning: divide by zero encountered in log\n",
      "<string>:17: RuntimeWarning: divide by zero encountered in double_scalars\n",
      "<string>:17: RuntimeWarning: invalid value encountered in double_scalars\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving kld matrix...\n",
      "stwts_universal_full.csv\n",
      "Creating kld matrix...\n",
      "Saving kld matrix...\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['eo_kld_cvec_pca16.npy',\n",
       " 'eo_kld_cvec_nmf16.npy',\n",
       " 'eo_kld_cvec_umap16.npy',\n",
       " 'eo_kld_cvec_tsne16.npy',\n",
       " 'eo_kld_lda100.npy',\n",
       " 'eo_kld_bert.npy',\n",
       " 'eo_kld_roberta.npy',\n",
       " 'eo_kld_distil.npy',\n",
       " 'eo_kld_glove6B.npy',\n",
       " 'eo_kld_universal.npy',\n",
       " 'stwts_kld_cvec_pca16.npy',\n",
       " 'stwts_kld_cvec_nmf16.npy',\n",
       " 'stwts_kld_cvec_umap16.npy',\n",
       " 'stwts_kld_cvec_tsne16.npy',\n",
       " 'stwts_kld_lda100.npy',\n",
       " 'stwts_kld_bert.npy',\n",
       " 'stwts_kld_roberta.npy',\n",
       " 'stwts_kld_distil.npy',\n",
       " 'stwts_kld_glove6B.npy',\n",
       " 'stwts_kld_universal.npy']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "saved_list = []\n",
    "for i in range(len(dataset_names)):\n",
    "  for j in range(len(embed_types)):\n",
    "    data = pd.read_csv(\"data/output/\"+dataset_names[i] + '_' + embed_types[j] + '_full.csv', index_col=0)\n",
    "    data = data.to_numpy()\n",
    "    print(dataset_names[i] + '_' + embed_types[j] + '_full.csv')\n",
    "    mu_list, sd_list = si.fit_norm(data)\n",
    "    print('Creating kld matrix...')\n",
    "    saved_list.append(si.get_kld_matrix(mu_list, sd_list, dataset_names[i], embed_types[j]))\n",
    "print()\n",
    "saved_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "4FGBrcDGdzoz"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eo_cvec_pca16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_cvec_nmf16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_cvec_umap16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_cvec_tsne16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_lda100_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_bert_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_roberta_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_distil_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_glove6B_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "eo_universal_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_cvec_pca16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_cvec_nmf16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_cvec_umap16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_cvec_tsne16_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_lda100_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_bert_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_roberta_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_distil_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_glove6B_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "stwts_universal_full.csv\n",
      "Creating ks matrix...\n",
      "Saving ks matrix...\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['eo_ks_cvec_pca16.npy',\n",
       " 'eo_ks_cvec_nmf16.npy',\n",
       " 'eo_ks_cvec_umap16.npy',\n",
       " 'eo_ks_cvec_tsne16.npy',\n",
       " 'eo_ks_lda100.npy',\n",
       " 'eo_ks_bert.npy',\n",
       " 'eo_ks_roberta.npy',\n",
       " 'eo_ks_distil.npy',\n",
       " 'eo_ks_glove6B.npy',\n",
       " 'eo_ks_universal.npy',\n",
       " 'stwts_ks_cvec_pca16.npy',\n",
       " 'stwts_ks_cvec_nmf16.npy',\n",
       " 'stwts_ks_cvec_umap16.npy',\n",
       " 'stwts_ks_cvec_tsne16.npy',\n",
       " 'stwts_ks_lda100.npy',\n",
       " 'stwts_ks_bert.npy',\n",
       " 'stwts_ks_roberta.npy',\n",
       " 'stwts_ks_distil.npy',\n",
       " 'stwts_ks_glove6B.npy',\n",
       " 'stwts_ks_universal.npy']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "saved_list = []\n",
    "for i in range(len(dataset_names)):\n",
    "  for j in range(len(embed_types)):\n",
    "    data = pd.read_csv(\"data/output/\"+dataset_names[i] + '_' + embed_types[j] + '_full.csv', index_col=0)\n",
    "    data = data.to_numpy()\n",
    "    print(dataset_names[i] + '_' + embed_types[j] + '_full.csv')\n",
    "    print('Creating ks matrix...')\n",
    "    saved_list.append(si.get_ks_matrix(data, dataset_names[i], embed_types[j]))\n",
    "print()\n",
    "saved_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 13429772,
     "status": "ok",
     "timestamp": 1616722391370,
     "user": {
      "displayName": "Apurva Bhargava",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gi3hHA-32IVQPOzXK40Itcc5oZmMDf0Vsnw_e_afg=s64",
      "userId": "07288249218888651888"
     },
     "user_tz": 240
    },
    "id": "EUQSvej7Hg7Y",
    "outputId": "c59c6923-cec5-4331-d45d-f09c7d9e225e"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eo_cvec_pca16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_cvec_nmf16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_cvec_umap16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_cvec_tsne16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_lda100_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_bert_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_roberta_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_distil_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_glove6B_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "eo_universal_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_cvec_pca16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_cvec_nmf16_full.csv\n",
      "Creating cos matrix...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ak8096\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\scipy\\spatial\\distance.py:630: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  dist = 1.0 - uv / np.sqrt(uu * vv)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving cosine matrix...\n",
      "stwts_cvec_umap16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_cvec_tsne16_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_lda100_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_bert_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_roberta_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_distil_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "stwts_glove6B_full.csv\n",
      "Creating cos matrix...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ak8096\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\scipy\\spatial\\distance.py:630: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  dist = 1.0 - uv / np.sqrt(uu * vv)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving cosine matrix...\n",
      "stwts_universal_full.csv\n",
      "Creating cos matrix...\n",
      "Saving cosine matrix...\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['eo_cos_cvec_pca16.npy',\n",
       " 'eo_cos_cvec_nmf16.npy',\n",
       " 'eo_cos_cvec_umap16.npy',\n",
       " 'eo_cos_cvec_tsne16.npy',\n",
       " 'eo_cos_lda100.npy',\n",
       " 'eo_cos_bert.npy',\n",
       " 'eo_cos_roberta.npy',\n",
       " 'eo_cos_distil.npy',\n",
       " 'eo_cos_glove6B.npy',\n",
       " 'eo_cos_universal.npy',\n",
       " 'stwts_cos_cvec_pca16.npy',\n",
       " 'stwts_cos_cvec_nmf16.npy',\n",
       " 'stwts_cos_cvec_umap16.npy',\n",
       " 'stwts_cos_cvec_tsne16.npy',\n",
       " 'stwts_cos_lda100.npy',\n",
       " 'stwts_cos_bert.npy',\n",
       " 'stwts_cos_roberta.npy',\n",
       " 'stwts_cos_distil.npy',\n",
       " 'stwts_cos_glove6B.npy',\n",
       " 'stwts_cos_universal.npy']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "saved_list = []\n",
    "for i in range(len(dataset_names)):\n",
    "  for j in range(len(embed_types)):\n",
    "    data = pd.read_csv(\"data/output/\"+dataset_names[i] + '_' + embed_types[j] + '_full.csv', index_col=0)\n",
    "    data = data.to_numpy()\n",
    "    print(dataset_names[i] + '_' + embed_types[j] + '_full.csv')\n",
    "    print('Creating cos matrix...')\n",
    "    saved_list.append(si.get_cos_matrix(data, dataset_names[i], embed_types[j]))\n",
    "print()\n",
    "saved_list"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOijvDKcA2lyThmMHwukVk7",
   "collapsed_sections": [],
   "name": "GenDistMatrices.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
